Import files

# Import train.csv, test.csv and store.csv
train <- read.csv("train.csv", stringsAsFactors = F)
test <- read.csv("test.csv", stringsAsFactors = F)
store <- read.csv("store.csv", stringsAsFactors = F)

Data Cleaning and Exploratory Data Analysis

1. Data Structure

No. of Rows No. of Columns
Train 1017209 9
Test 41088 8
Store 1115 10

1.1 Convert data types

  • Insert explanation for the type conversion here
#a. Train
train <- train %>% mutate(
  DayOfWeek                 = as.factor(DayOfWeek),
  Date                      = as.Date(Date),
  Open                      = as.factor(Open),
  Promo                     = as.factor(Promo), 
  StateHoliday              = as.factor(StateHoliday),   # Has 4 values!
  SchoolHoliday             = as.factor(SchoolHoliday))

str(train)
## 'data.frame':    1017209 obs. of  9 variables:
##  $ Store        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek    : Factor w/ 7 levels "1","2","3","4",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ Date         : Date, format: "2015-07-31" "2015-07-31" ...
##  $ Sales        : int  5263 6064 8314 13995 4822 5651 15344 8492 8565 7185 ...
##  $ Customers    : int  555 625 821 1498 559 589 1414 833 687 681 ...
##  $ Open         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Promo        : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ StateHoliday : Factor w/ 4 levels "0","a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SchoolHoliday: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
#b. Test
test <- test %>% mutate(
  DayOfWeek                 = as.factor(DayOfWeek),
  Date                      = as.Date(Date),
  Open                      = as.factor(Open),
  Promo                     = as.factor(Promo),
  StateHoliday              = as.factor(StateHoliday),   # Only 2 values! What're the state holidays?
  SchoolHoliday             = as.factor(SchoolHoliday))
 str(test)
## 'data.frame':    41088 obs. of  8 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Store        : int  1 3 7 8 9 10 11 12 13 14 ...
##  $ DayOfWeek    : Factor w/ 7 levels "1","2","3","4",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Date         : Date, format: "2015-09-17" "2015-09-17" ...
##  $ Open         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Promo        : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ StateHoliday : Factor w/ 2 levels "0","a": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SchoolHoliday: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#c. Store
store <- store %>% mutate(
  StoreType                 = as.factor(StoreType),
  Assortment                = as.factor(Assortment),
  Promo2                    = as.factor(Promo2),
  PromoInterval             = as.factor(PromoInterval),
  CompetitionOpenSinceMonth = as.numeric(CompetitionOpenSinceMonth),
  CompetitionOpenSinceYear  = as.numeric(CompetitionOpenSinceYear))
str(store)
## 'data.frame':    1115 obs. of  10 variables:
##  $ Store                    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ StoreType                : Factor w/ 4 levels "a","b","c","d": 3 1 1 3 1 1 1 1 1 1 ...
##  $ Assortment               : Factor w/ 3 levels "a","b","c": 1 1 1 3 1 1 3 1 3 1 ...
##  $ CompetitionDistance      : int  1270 570 14130 620 29910 310 24000 7520 2030 3160 ...
##  $ CompetitionOpenSinceMonth: num  9 11 12 9 4 12 4 10 8 9 ...
##  $ CompetitionOpenSinceYear : num  2008 2007 2006 2009 2015 ...
##  $ Promo2                   : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 1 1 1 ...
##  $ Promo2SinceWeek          : int  NA 13 14 NA NA NA NA NA NA NA ...
##  $ Promo2SinceYear          : int  NA 2010 2011 NA NA NA NA NA NA NA ...
##  $ PromoInterval            : Factor w/ 4 levels "","Feb,May,Aug,Nov",..: 1 3 3 1 1 1 1 1 1 1 ...

2. Check and impute values for NA

2.1 Train and Test

No. of NAs
Store 0
DayOfWeek 0
Date 0
Sales 0
Customers 0
Open 0
Promo 0
StateHoliday 0
SchoolHoliday 0
No. of NAs
Id 0
Store 0
DayOfWeek 0
Date 0
Open 11
Promo 0
StateHoliday 0
SchoolHoliday 0

Observations:

  • In the test set, variable ‘Open’ should have only two possible values (Open = 1 or Closed = 0), so the 11 NA’s should be changed to either 1 or 0.
  • If Open is = 1, but we assume = 0, the error score will increase because of misprediction.
  • If Open is = 0, but we assume = 1, then there’s no penalty in scoring as closed stores with 0 sales are not considered in scoring.

Hence, we will impute 1 into the NA values for the ‘Open’ variable in the test dataset.

Test - Impute values for Open = NA

# a. Retrieve records with Open = NA
test %>% filter(is.na(Open)) %>% html_df()
Id Store DayOfWeek Date Open Promo StateHoliday SchoolHoliday
480 622 4 2015-09-17 NA 1 0 0
1336 622 3 2015-09-16 NA 1 0 0
2192 622 2 2015-09-15 NA 1 0 0
3048 622 1 2015-09-14 NA 1 0 0
4760 622 6 2015-09-12 NA 0 0 0
5616 622 5 2015-09-11 NA 0 0 0
6472 622 4 2015-09-10 NA 0 0 0
7328 622 3 2015-09-09 NA 0 0 0
8184 622 2 2015-09-08 NA 0 0 0
9040 622 1 2015-09-07 NA 0 0 0
10752 622 6 2015-09-05 NA 0 0 0
# b. Impute NA with Open = 1
test <- test %>% mutate(Open = replace(Open, is.na(Open),1))

# c. Check if NA has been replaced:
sum(is.na(test$Open))
## [1] 0

2.2 Store

No. of NAs
Store 0
StoreType 0
Assortment 0
CompetitionDistance 3
CompetitionOpenSinceMonth 354
CompetitionOpenSinceYear 354
Promo2 0
Promo2SinceWeek 544
Promo2SinceYear 544
PromoInterval 0

Store - Impute values for CompetitionDistance

With mean

# Impute NA with mean of CompetitionDistance 
store <- store %>% 
  mutate(CompetitionDistance= replace(CompetitionDistance, is.na(CompetitionDistance), mean(CompetitionDistance,na.rm=T)))
          
# Check for NAs
store %>% filter(is.na(CompetitionDistance)) %>% nrow() 
## [1] 0

Store - Impute values for CompetitionOpenSince and Promo2Since

# Impute NA with median   
store <- store %>%
          mutate(CompetitionOpenSinceMonth=ifelse(is.na(CompetitionOpenSinceMonth),median(CompetitionOpenSinceMonth,na.rm=T), CompetitionOpenSinceMonth),
                 CompetitionOpenSinceYear=ifelse(is.na(CompetitionOpenSinceYear),median(CompetitionOpenSinceYear,na.rm=T), CompetitionOpenSinceYear),
                 Promo2SinceYear=ifelse(is.na(Promo2SinceYear),0, Promo2SinceYear),
                 Promo2SinceWeek=ifelse(is.na(Promo2SinceWeek),0, Promo2SinceWeek))

# Check no. of NAs for CompetitionOpenSinceMonth/Year
store %>% is.na() %>% colSums() %>% data.frame() %>% `colnames<-`("No. of NAs") %>% html_df
No. of NAs
Store 0
StoreType 0
Assortment 0
CompetitionDistance 0
CompetitionOpenSinceMonth 0
CompetitionOpenSinceYear 0
Promo2 0
Promo2SinceWeek 0
Promo2SinceYear 0
PromoInterval 0

3. Completeness Check - Dates

plot(train$Date, type = "l") 

plot(test$Date, type = "l") 

No visible breaks in data, hence no missing data by date.

## Find missing data ##

# Expected rows of records (1115 x 941 days) = 1,049,215  vs Actual = 1,017,209. Missing records = 33,121

# 1. Finding all combinations of stores and dates
allStoresAndDates <- expand.grid(unique(train.store$Store), unique(train.store$Date))
# Explanation
 # - List all permutations of stores (1,115) and dates (971 days) 


# 2. Naming the two columns in the newly created dataframe for step 3
names(allStoresAndDates) <- c("Store", "Date")


# 3. Extract stores with missing dates and consequently sales data.
missingDatesForStores <-  anti_join(allStoresAndDates, train.store, by = c("Store", "Date"))
# Explanation
  # - anti_join is a dplyr function that finds unmatched records.
  # - 1st parameter = "Master table"
  # - 2nd parameter = Comparison table
  # - Function checks "train.store" records against "allStoresAndDates" and for 
  # records that train.store do not have, show it as an output.
# Actual missing records do not equate to expected missing records as some stores may only be opened after the start date.


## Note: Ignoring missing data ##
# As per competition host, Florian, "The missing data you’re observing for a 6 month period in 2014 was a mistake done by us. For some stores this data was simply not included in the train-set. We’ve discussed this with Kaggle and decided that it’s an insignificant omission as there are still more than enough store/date combinations left to create a model on.""

## Conclusion: Proceed to find other missing data.

Exploratory Data Analysis

Data Merging (Take note of what enters in afterwards)

train.store <- merge(train, store, by = "Store")
test.store <- merge(test, store, by = "Store")

1. Day of week (YC)

Sunday has the least sales for all opened stores over the data period, and that could be because most stores are closed on Sundays.

# Check if closed stores have any sales. Result = no anomalies.
train.closed <- train[train$Open == 0,]
train.closed$Sales %>% sum()  
## [1] 0
# First plot
ggplot(data = train, aes (x= DayOfWeek, y= Sales)) +
geom_bar(stat = "identity")

# Second plot
train %>% group_by(DayOfWeek, Open) %>% tally() %>%
  ggplot(aes(x=DayOfWeek, y=n, fill = Open)) +
  geom_bar(stat="identity")

2. PromotionInterval, StoreType, Assortment Analysis (YC)

# PromotionInterval
ggplot(train.store, aes(x = factor(PromoInterval), y = Sales, color = PromoInterval)) +
    geom_col() +
    ggtitle("Sales by PromoInterval")

# StoreType
ggplot(train.store, aes(x = Date, y = Sales, color = StoreType))+ 
    geom_smooth(se= F, size = 1.5) +
    ggtitle("Sales by StoreType")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(train.store, aes(x = Date, y = Customers, color = StoreType)) + 
    geom_smooth(se= F, size = 1.5) +
    ggtitle("Customers by StoreType")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Assortment
ggplot(train.store, aes(x = Date, y = Sales, color = Assortment)) + 
    geom_smooth(se= F, size = 1.5) +
    ggtitle("Sales by Assortment")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(train.store, aes(x = Date, y = Customers, color = Assortment)) + 
    geom_smooth(se= F, size = 1.5) +
    ggtitle("Customers by Assortment")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## 3. CompetitionDistance, OpenSinceMonth/Year

# Combine year and month into one date variable:
store$CompetitionOpenSince <-as.yearmon(paste(store$CompetitionOpenSinceYear, 
                                               store$CompetitionOpenSinceMonth, sep = "-"))

# P.S: yearmon functon creates a numeric vector interpreted in "years" and fractions of years. e.g. 1961.5 = June 1961.

# Histogram for CompetitionOpenedSince
plot_ly(x= store$CompetitionOpenSince, type = "histogram") %>%
layout(title = "Distribution of CompetitionOpenedSince",
         xaxis = list(title = "Year",
                      zeroline = FALSE),
         yaxis = list(title = "Count",
                      zeroline = FALSE))

Observations: Many competitors opened recently, except 1 that opened in 1900 and 1 in 1961.

4. Promo2, Promo2Since Week/Year

# Combine year and month into one date variable:
store$Promo2Since <- as.POSIXct(paste(store$Promo2SinceYear, 
                                   store$Promo2SinceWeek, 1, sep = "-"),
                             format = "%Y-%U-%u")

hist(as.numeric(as.POSIXct("2015-10-01", format = "%Y-%m-%d") - store$Promo2Since), 
     100, main = "Days since start of promo2")

# Histogram for Promo2Since (in days)
plot_ly(x= as.POSIXct("2015-10-01", format = "%Y-%m-%d") - store$Promo2Since, type = "histogram") %>%
layout(title = "Distribution of Promo2Since",
         xaxis = list(title = "Days",
                      zeroline = FALSE),
         yaxis = list(title = "Count",
                      zeroline = FALSE))
## Warning: Ignoring 544 observations

5. Competition Distance

# MeanSales by CompetitionDistance
salesbydist <- train.store %>% group_by(CompetitionDistance) %>% summarise(MeanSales = mean(Sales, na.rm=TRUE))

## NOTE: Plotting without mean makes everthing too cluttered. Code below can't see shit. Followed online guide.
## ggplot(train.store, aes(x = CompetitionDistance, y = Sales)) + geom_point() + geom_smooth() 

# salesbydist scatterplot 

ggplot(salesbydist, aes(x = CompetitionDistance, y = MeanSales)) + 
    geom_point() + geom_smooth() + scale_x_log10() + scale_y_log10()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Observations: Interestingly, stores with competition that are closer have slightly higher sales on average while those with competition that are further have slightly lower sales. Just based on this graph alone, we cannot deduce much, but a possibility is that the stores with close competitors are situated in areas with high footfall such as cities, contributing to slightly higher revenue.

train.store <- merge(train, store, by = "Store")

train.store2 <- train.store %>% dplyr:: select(
  DayOfWeek, #1
  Sales,     #2
  Customers, #3
  Open,      #4
  Promo,     #5
  StateHoliday, #6
  SchoolHoliday, #7 
  StoreType,   #8
  Assortment,  #9
  CompetitionDistance,  #10
  Promo2,               #11
  PromoInterval,        #12
  CompetitionOpenSince) #13
  #Promo2Since)          #14
str(train.store2)
## 'data.frame':    1017209 obs. of  13 variables:
##  $ DayOfWeek           : Factor w/ 7 levels "1","2","3","4",..: 5 6 5 3 3 7 3 1 5 1 ...
##  $ Sales               : int  5263 4952 4190 6454 3310 0 3591 4770 3836 3722 ...
##  $ Customers           : int  555 646 552 695 464 0 453 542 466 480 ...
##  $ Open                : Factor w/ 2 levels "0","1": 2 2 2 2 2 1 2 2 2 2 ...
##  $ Promo               : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 2 1 1 ...
##  $ StateHoliday        : Factor w/ 4 levels "0","a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SchoolHoliday       : Factor w/ 2 levels "0","1": 2 1 2 1 1 1 1 1 1 1 ...
##  $ StoreType           : Factor w/ 4 levels "a","b","c","d": 3 3 3 3 3 3 3 3 3 3 ...
##  $ Assortment          : Factor w/ 3 levels "a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
##  $ CompetitionDistance : num  1270 1270 1270 1270 1270 1270 1270 1270 1270 1270 ...
##  $ Promo2              : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PromoInterval       : Factor w/ 4 levels "","Feb,May,Aug,Nov",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ CompetitionOpenSince: 'yearmon' num  Sep 2008 Sep 2008 Sep 2008 Sep 2008 ...

Model creation

1. Using step-wise regression to select best variables

# Run lm first
train.mlm <- lm(Sales ~.,  data = train.store2)
str(train.store)

# Ultimate step-wise regression...is useless in feature selection here...
training.swr <- step(train.mlm, direction = "both")
summary(training.swr) 

Observation: All variables are significant with stepwise regression. Proceed to classification trees for prediction

2. Using decision trees to predict sales

# Decision Tree with Rpart function
train.dt <- rpart(Sales ~., data = train.store2, control = rpart.control(cp = 0.0001))

# Choosing the best cp (complexity parameter)
bestcp <- train.dt$cptable[which.min(train.dt$cptable[,"xerror"]),"CP"]
train.dt.pruned <- prune(train.dt, cp = bestcp)

# Confusion matrix
conf.matrix <- table(train.dt.pruned$Sales, predict(train.dt.pruned, na.action = na.pass))

rownames(conf.matrix) <- paste("Actual", rownames(conf.matrix), sep = ":")
colnames(conf.matrix) <- paste("Pred", colnames(conf.matrix), sep = ":")
print(conf.matrix)

# Use training data to predict and assess performance of  model
Train.predict <- predict(train.dt.pruned, train.store2, type = "matrix")     
  
confusionMatrix(table(Train.predict, train.store2$Sales),positive = "1")